In [1]:
import pandas as pd
import numpy as np
from numba import jit
%load_ext line_profiler
print(__doc__)
from matplotlib import pyplot as plt
from sklearn.datasets import make_checkerboard
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.metrics import consensus_score
In [2]:
#Load in data
# NOTE(review): hardcoded relative path — the notebook must be run from the
# directory containing data.txt (consider a configurable DATA_DIR constant).
X = np.loadtxt('data.txt')
In [26]:
@jit(nopython=True)
def gt(a, b):
    '''
    Elementwise float comparison: out[i] = 1.0 if a[i] >= b[i] else 0.0.

    a and b must be 1-D arrays of the same length. The result is float64
    (not bool) so it can be multiplied directly into the soft-thresholding
    expressions under numba's nopython mode.
    '''
    length = len(a)
    out = np.zeros(length, dtype=np.float64)
    for idx in range(length):
        if a[idx] >= b[idx]:
            out[idx] = 1.0
    return out
In [27]:
@jit(nopython=True)
def updateU(Bu, tmpI, n, d, ru, tu, winu, z, X, v1, sigsq):
    '''
    BIC scan over candidate sparsity levels for the left vector u.

    For each of the first ru candidate thresholds (drawn from the sorted
    magnitudes tu via the index order tmpI), soft-threshold z to get a
    candidate uc and score the rank-one fit:

        Bu[i] = ||X - uc v1^T||_F^2 / sigsq + (i+1) * log(n*d)

    Parameters (positional, numba nopython-compatible):
      Bu     output score array (length n+1), prefilled with +inf by caller
      tmpI   descending candidate index order into tu
      n, d   dimensions of X
      ru     number of nonzero candidate thresholds to score
      tu     sorted candidate threshold magnitudes (with a 0 appended)
      winu   adaptive weights |z|**gamu (all ones when gamu == 0)
      z      current projection X @ v1
      X      data matrix
      v1     current right vector
      sigsq  noise-variance estimate
    Returns Bu with its first ru entries filled.
    '''
    log_nd = np.log(n * d)   # loop-invariant model-size penalty, hoisted
    nz = winu != 0           # with gamu == 0 every weight is 1, so this keeps all entries
    zs = z[nz]               # masked quantities are loop-invariant — hoisted
    abs_zs = np.abs(zs)
    sign_zs = np.sign(zs)
    w = winu[nz]
    for i in range(ru):
        luc = tu[tmpI[i]]
        # Per-coordinate threshold (adaptive-lasso scaling by the weights);
        # matches the lu / winu application step in the caller.
        paralambda = luc / w
        keep = gt(abs_zs, paralambda)                 # 1.0 where |z| >= threshold
        uc = np.multiply(np.multiply(sign_zs, keep), abs_zs - paralambda)  # soft threshold
        # Frobenius residual of the rank-one fit, accumulated column-wise in
        # the original order to keep floating-point results identical.
        # (Was X[j][k]: chained indexing builds a row view per access under
        # numba; X[j, k] is the idiomatic direct 2-D access.)
        colacc = np.zeros(len(v1))
        for j in range(len(uc)):
            for k in range(len(v1)):
                colacc[k] = colacc[k] + (X[j, k] - uc[j] * v1[k]) ** 2
        total = 0.0
        for k in range(len(v1)):
            total += colacc[k]
        Bu[i] = total / sigsq + (i + 1) * log_nd
    return Bu
In [28]:
@jit(nopython=True)
def updateV(Bv, tmpI, n, d, rv, tv, winv, z, u0, X, sigsq):
    '''
    BIC scan over candidate sparsity levels for the right vector v.

    Mirror image of updateU: for each of the first rv candidate thresholds,
    soft-threshold z to a candidate vc and score the rank-one fit:

        Bv[i] = ||X - u0 vc^T||_F^2 / sigsq + (i+1) * log(n*d)

    Parameters (positional, numba nopython-compatible):
      Bv     output score array (length d+1), prefilled with +inf by caller
      tmpI   descending candidate index order into tv
      n, d   dimensions of X
      rv     number of nonzero candidate thresholds to score
      tv     sorted candidate threshold magnitudes (with a 0 appended)
      winv   adaptive weights |z|**gamv (all ones when gamv == 0)
      z      current projection X.T @ u0
      u0     current left vector
      X      data matrix
      sigsq  noise-variance estimate
    Returns Bv with its first rv entries filled.
    '''
    log_nd = np.log(n * d)   # loop-invariant model-size penalty, hoisted
    nz = winv != 0           # with gamv == 0 every weight is 1, so this keeps all entries
    zs = z[nz]               # masked quantities are loop-invariant — hoisted
    abs_zs = np.abs(zs)
    sign_zs = np.sign(zs)
    w = winv[nz]
    for i in range(rv):
        lvc = tv[tmpI[i]]
        # Per-coordinate threshold (adaptive-lasso scaling by the weights).
        paralambda = lvc / w
        keep = gt(abs_zs, paralambda)                 # 1.0 where |z| >= threshold
        vc = np.multiply(np.multiply(sign_zs, keep), abs_zs - paralambda)  # soft threshold
        # Frobenius residual accumulated column-wise in the original order
        # (floating-point identical); X[j, k] replaces chained X[j][k].
        colacc = np.zeros(len(vc))
        for j in range(len(u0)):
            for k in range(len(vc)):
                colacc[k] = colacc[k] + (X[j, k] - u0[j] * vc[k]) ** 2
        total = 0.0
        for k in range(len(vc)):
            total += colacc[k]
        Bv[i] = total / sigsq + (i + 1) * log_nd
    return Bv
In [29]:
def ssvd(X, param=None):
    '''
    Rank-one sparse SVD (SSVD) layer extraction.

    Alternates adaptive soft-thresholding updates of the left (u) and right
    (v) singular vectors, choosing each threshold with a BIC-style criterion
    (updateU / updateV), until both vectors change by less than `merr` in
    Euclidean norm or `niter` iterations are exceeded.

    Parameters
    ----------
    X : 2-D ndarray, shape (n, d)
        Data matrix.
    param : unused
        Kept for interface compatibility.

    Returns
    -------
    u, v : 1-D ndarrays
        Sparse left / right unit vectors of the extracted layer.
    iters : int
        Number of alternating iterations performed.
    '''
    n, d = X.shape
    # Weight exponents for the adaptive penalties. With gamu = gamv = 0 every
    # weight below equals 1, i.e. plain (non-adaptive) soft thresholding.
    gamu = 0
    gamv = 0
    # Initialize (u, v) from the leading singular pair of X.
    U, _, Vt = np.linalg.svd(X)
    V = Vt.T
    u0 = U[:, 0]
    v0 = V[:, 0]
    merr = 10**-4        # convergence tolerance on ||u_new - u_old||, ||v_new - v_old||
    niter = 100          # iteration cap
    ud = 1
    vd = 1
    iters = 0
    SST = np.sum(X**2)   # total sum of squares, reused in both variance estimates
    # (Unused locals threu/threv and the SCAD constant a = 3.7 from the
    # original were removed — nothing below referenced them.)
    while ud > merr or vd > merr:
        iters = iters + 1

        # ---- v-step: update the right vector given u0 ----
        z = np.matmul(X.T, u0)
        winv = np.abs(z)**gamv
        sigsq = (SST - np.sum(z**2)) / (n*d - d)   # noise-variance estimate
        # Candidate thresholds: sorted |z * winv| with a 0 appended.
        # FIX: was np.abs(z**winv); every other weight use multiplies (see
        # the u-step and the lv selection below), and z**winv yields NaN for
        # negative z with non-integer gamv. With gamv == 0 both forms equal
        # |z|, so default behavior is unchanged.
        tv = np.sort(np.append(np.abs(z * winv), 0))
        rv = np.sum(tv > 0)
        Bv = np.ones(d + 1) * np.inf               # np.Inf was removed in NumPy 2.0
        tmpI = np.arange(d - 1, -1, -1)
        Bv = updateV(Bv, tmpI, n, d, rv, tv, winv, z, u0, X, sigsq)
        Iv = np.argmin(Bv) + 1
        lv = tv[d - Iv]                            # reuse tv instead of re-sorting
        # FIX: apply the threshold as lv / winv, matching the candidate
        # scaling inside updateV and the u-step below (previously lv * winv).
        # With gamv == 0 the two coincide. TODO(review): confirm against the
        # SSVD reference implementation for gamv != 0.
        paralambda = lv / winv[winv != 0]
        tmp = np.multiply(np.sign(z[winv != 0]), gt(np.abs(z[winv != 0]), paralambda))
        v1 = np.multiply(tmp, np.abs(z[winv != 0]) - paralambda)
        v1 = v1 / np.sqrt(np.sum(v1**2))           # normalize: v_new

        # ---- u-step: update the left vector given v1 ----
        z = np.matmul(X, v1)
        winu = np.abs(z)**gamu
        sigsq = (SST - np.sum(z**2)) / (n*d - n)
        tu = np.sort(np.append(np.abs(np.multiply(z, winu)), 0))
        ru = np.sum(tu > 0)
        Bu = np.ones(n + 1) * np.inf
        tmpI = np.arange(n - 1, -1, -1)
        Bu = updateU(Bu, tmpI, n, d, ru, tu, winu, z, X, v1, sigsq)
        Iu = np.argmin(Bu) + 1
        lu = tu[n - Iu]                            # reuse tu instead of re-sorting
        paralambda = lu / winu[winu != 0]
        tmp = np.multiply(np.sign(z[winu != 0]), gt(np.abs(z[winu != 0]), paralambda))
        u1 = np.multiply(tmp, np.abs(z[winu != 0]) - paralambda)
        u1 = u1 / np.sqrt(np.sum(u1**2))           # normalize: u_new

        # ---- convergence bookkeeping ----
        ud = np.sqrt(np.sum((u0 - u1)**2))
        vd = np.sqrt(np.sum((v0 - v1)**2))
        if iters > niter:
            print('Fail to converge! Increase the niter!')
            break
        u0 = u1
        v0 = v1
    return u1, v1, iters
In [105]:
#Obtain the answer
# Run the sparse-SVD routine defined above on the loaded genetics matrix.
[u,v,iters] = ssvd(X)
In [8]:
#How long does this function take?
# Single timed run (-r1 -n1): after the first call the numba-jitted helpers
# are compiled, so one run reflects steady-state cost.
%timeit -r1 -n1 [u,v,iters] = ssvd(X)
In [ ]:
#Use line profiling
#Used primarily for debugging purposes, but can be used to see the processing requirements of the final algorithm
# Requires %load_ext line_profiler (done in the first cell). Results are
# written to ssvd_results.txt and echoed with %cat below.
%lprun -s -f ssvd -T ssvd_results.txt ssvd(X)
%cat ssvd_results.txt
In [43]:
# Demo: sklearn SpectralBiclustering on a synthetic 300x300 checkerboard
# (4x3 clusters), following the sklearn gallery example.
n_clusters = (4, 3)
data, rows, columns = make_checkerboard(
    shape=(300, 300), n_clusters=n_clusters, noise=10,
    shuffle=False, random_state=0)
plt.matshow(data, cmap=plt.cm.coolwarm_r)
plt.title("Original dataset", y=1.15)
# NOTE(review): sg._shuffle is a private sklearn API and the
# sklearn.datasets.samples_generator module was removed in newer sklearn
# releases — pin the sklearn version or inline a shuffle helper.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.coolwarm_r)
plt.title("Shuffled dataset", y=1.15)
model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
# Rearrange rows/columns by the fitted labels to reveal the biclusters.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.coolwarm_r)
plt.title("After biclustering; rearranged to show biclusters", y=1.15)
plt.show()
In [8]:
#Rank 1
# Extract the first sparse layer and deflate X by its rank-one approximation.
# NOTE(review): X is overwritten in place, so re-running this cell deflates
# again — a fresh Restart & Run All is needed for reproducible results.
[u1,v1,iters] = ssvd(X)
Xstar1 = np.outer(u1, v1.T)
X = X-Xstar1
#Rank 2
# [u2,v2,iters] = ssvd(X)
# Xstar2 = np.outer(u2, v2.T)
# X = X-Xstar2
# #Rank 3
# [u3,v3,iters] = ssvd(X)
# Xstar3 = np.outer(u3, v3.T)
# X = X-Xstar3
In [9]:
# Sort u and v independently (descending) so the rank-one image below shows
# a block structure.
v1sort = np.sort(v1)[::-1] #sort descending
u1sort = np.sort(u1)[::-1] #sort descending
x = np.outer(u1sort, v1sort.T)
# NOTE(review): X was already deflated above, so this normalizes by the
# residual's max rather than the original data's — confirm this is intended.
x = x/np.max(np.abs(X))
In [14]:
plt.matshow(x.T, cmap=plt.cm.coolwarm_r, aspect='auto');
plt.colorbar()
plt.title('SSVD Algorithm on SSVD Genetics Dataset', y=1.15)
plt.show()
In [18]:
# Baseline: sklearn spectral biclustering on the (deflated) genetics matrix.
model = SpectralBiclustering(n_clusters=4, method='log',
                             random_state=0)
model.fit(X)
Out[18]:
In [20]:
fit_data = X[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
# NOTE(review): np.sort here sorts each row independently, which scrambles
# the column rearrangement done just above — verify this is the intended
# display rather than a leftover.
fit_data = np.sort(fit_data)
plt.matshow(fit_data.T, cmap=plt.cm.coolwarm_r, aspect='auto');
plt.colorbar()
plt.title('SKLearn Algorithm on SSVD Genetics Dataset', y=1.15)
plt.show()
In [21]:
#Make bogus data with a checkerboard structure
# Synthetic 12000x60 checkerboard with 2x2 clusters and Gaussian noise.
n_clusters = (2, 2)
data, rows, columns = make_checkerboard(
    shape=(12000, 60), n_clusters=n_clusters, noise=4,
    shuffle=False, random_state=0)
In [28]:
# Baseline: sklearn spectral biclustering on the shuffled synthetic data.
plt.matshow(data.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Original Synthesized Dataset", y=1.15)
# NOTE(review): sg._shuffle is a private sklearn API (module removed in
# newer sklearn releases) — pin the version or replace with an explicit
# permutation.
datashuff, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(datashuff.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Shuffled Synthesized Dataset", y=1.15)
model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(datashuff)
fit_data = datashuff[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("After Sklearn biclustering; rearranged to show biclusters on synthesized data", y=1.15)
plt.show()
In [30]:
#Implement our SSVD algorithm on non-sparse synthesized data
[u1,v1,iters] = ssvd(data)
Xstar1 = np.outer(u1, v1.T)
# NOTE(review): this rebinds X (previously the genetics residual) to the
# synthetic residual — any later cell that reads X sees this value.
X = data-Xstar1
xmax = np.max(np.abs(X))
In [31]:
# Independently sorted u and v give a block-structured rank-one image.
v1sort = np.sort(v1)[::-1] #sort descending
u1sort = np.sort(u1)[::-1] #sort descending
x = np.outer(u1sort, v1sort.T)
xfake = x/xmax
In [32]:
plt.matshow(data.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Original Synthesized Dataset", y=1.15)
datashuff, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(datashuff.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Shuffled Synthesized Dataset", y=1.15)
plt.matshow(xfake.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("After SSVD biclustering; rearranged to show biclusters on Synthesized Data", y=1.15)
plt.show()
In [1]:
# The ground truth is a 100×50 matrix X∗ whose nonzero entries are the same.
# This is the simplest example because every nonzero element of X∗ is the same,
# and hence is equally likely to be chosen by a sparse procedure.
In [9]:
# u is a unit vector of length 100 with ui = 1/√50 for i = 1, . . . , 50, and ui = 0 otherwise
# 50 entries of 1/sqrt(50) followed by 50 zeros, so ||u|| = 1.
ufirst = (1/np.sqrt(50))*np.ones((50,1))
ulast = np.zeros((50,1))
u = np.hstack((ufirst.flatten(), ulast.flatten()))
In [11]:
# v is a unit vector of length 50 with vj = 1/5 for j = 1, . . . , 25, and vj = 0 otherwise.
# 25 entries of 1/5 followed by 25 zeros, so ||v||^2 = 25*(1/5)^2 = 1.
vfirst = (1/5)*np.ones((25,1))
vlast = np.zeros((25,1))
v = np.hstack((vfirst.flatten(), vlast.flatten()))
In [14]:
# X* = suvT is a rank one matrix with uniform nonzero entries. s is set to 30
# (.T on a 1-D array is a no-op; np.outer treats its inputs as vectors anyway)
s = 30
Xstar = s*np.outer(u, v.T)
Xstar
Out[14]:
In [16]:
print(Xstar.shape)
In [22]:
# Standard-normal noise with the same shape as the signal.
# NOTE(review): no random seed is set, so this cell is not reproducible
# across runs.
noise = np.random.standard_normal(size=Xstar.shape)
noise
Out[22]:
In [24]:
#The input matrix X is the summation of the true signal, Xstar, and noise from the standard normal distribution
X = Xstar + noise
X
Out[24]:
In [40]:
plt.matshow(Xstar.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Ground Truth Sparse Synthesized Dataset", y=1.15)
plt.matshow(X.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Ground Truth Sparse Synthesized Dataset Plus Noise", y=1.15)
datashuff, row_idx, col_idx = sg._shuffle(X, random_state=0)
plt.matshow(datashuff.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Shuffled Sparse Synthesized Dataset", y=1.15)
#Implement our SSVD algorithm on synthesized sparse data
[u1,v1,iters] = ssvd(datashuff)
Xstar1 = np.outer(u1, v1.T)
# NOTE(review): the factors come from the *shuffled* matrix but are
# subtracted from the unshuffled X — confirm this mixing is intended.
Xbi = X-Xstar1
xmax = np.max(np.abs(Xbi))
v1sort = np.sort(v1)[::-1] #sort descending
u1sort = np.sort(u1)[::-1] #sort descending
x = np.outer(u1sort, v1sort.T)
# Xbi is rebound here: from the residual matrix above to the normalized,
# sorted rank-one image that gets plotted.
Xbi = x/xmax
plt.matshow(Xbi.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("After SSVD biclustering; rearranged to show biclusters on Synthesized Data", y=1.15)
plt.colorbar()
plt.show()
In [41]:
# Baseline comparison: sklearn spectral biclustering on the same sparse data.
plt.matshow(X.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Ground Truth Sparse Synthesized Dataset Plus Noise", y=1.15)
datashuff, row_idx, col_idx = sg._shuffle(X, random_state=0)
plt.matshow(datashuff.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("Shuffled Sparse Synthesized Dataset", y=1.15)
model = SpectralBiclustering(n_clusters=2, method='log',
                             random_state=0)
model.fit(datashuff)
fit_data = datashuff[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data.T, cmap=plt.cm.coolwarm_r, aspect='auto')
plt.title("After Sklearn biclustering; rearranged to show biclusters on synthesized data", y=1.15)
plt.show()
In [ ]:
In [ ]: